import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.tree import export_graphviz
from IPython.display import SVG, display
import graphviz
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
!pip install graphviz
# Load the intrusion-detection dataset and the attack-category -> attack-type
# lookup table from the local text files.
dataset = pd.read_csv('Dataset.txt', sep=',')
attack_types = pd.read_csv('Attack_types.txt', sep=' ')

# Notebook-style sanity checks on both frames (display-only expressions).
dataset.head()
dataset.tail()
dataset.info()
dataset.describe()
attack_types.head()
attack_types.tail()
attack_types.describe()
attack_types.info()
# Add a Target Column (broad attack type) to the dataset: map each fine-grained
# attack category to its attack type via the lookup table; 'normal' traffic
# keeps the label 'normal'.
attack_types_dict = attack_types.set_index('attack_category')['attack_type'].to_dict()
# Idiom fix: replaced the manual append loop with a list comprehension.
# NOTE: an unknown category (other than 'normal') still raises KeyError on
# purpose, so bad data is surfaced instead of silently mislabelled.
dataset['attack_type'] = [
    'normal' if attack_category == 'normal' else attack_types_dict[attack_category]
    for attack_category in dataset['attack_category']
]
dataset.head()
dataset.tail()
attack_types.head()
attack_types.tail()
# Bar chart of how often each broad attack type occurs.
attack_types_count = dataset['attack_type'].value_counts()
colors = ['red', 'blue', 'green', 'orange']
fig = plt.figure(figsize=(10, 10))
plt.bar(attack_types_count.index, attack_types_count.values, color=colors)
plt.title('Attack Types Frequencies')  # fixed typo: was "Frequecies"
plt.xlabel('Attacks')
plt.ylabel('Frequency')
# save the figure as PNG Image
fig.savefig('attack_types_frequencies')

# Bar chart of how often each fine-grained attack category occurs.
attack_categories_count = dataset['attack_category'].value_counts()
fig = plt.figure(figsize=(30, 20))
plt.bar(attack_categories_count.index, attack_categories_count.values, color=colors)
plt.title('Attack Categories Frequencies')  # fixed typo: was "Frequecies"
plt.xlabel('Attack Categories')
plt.ylabel('Frequency')
plt.xticks(rotation=45)  # tilt category names so they stay readable
# save the figure as PNG Image
fig.savefig('attack_categories_frequencies')
dataset.head()
# Integer-encode every categorical column so the sklearn estimators below can
# consume them. fit_transform refits the encoder independently per column.
categorical_cols = ['protocol_type', 'service', 'flag', 'attack_category', 'attack_type']
label_encoder = LabelEncoder()
for category_col in categorical_cols:
    encoded = label_encoder.fit_transform(dataset[category_col])
    dataset[category_col] = encoded
dataset.head()
# Annotated correlation heatmap over the full (now all-numeric) dataset,
# saved at print resolution.
corr_matrix = dataset.corr()
cmap = sns.color_palette("plasma")
fig, ax = plt.subplots(figsize=(40, 40))
sns.heatmap(
    corr_matrix,
    cmap=cmap,
    center=0,
    square=True,
    annot=True,
    annot_kws={'fontsize': 12},
    linewidths=.5,
    ax=ax,
)
ax.tick_params(axis='both', labelsize=14)
ax.set_title('Correlation Matrix', fontsize=18)
fig.savefig('correlation_matrix_of_dataset.png', dpi=300, bbox_inches='tight')
plt.show()
# Select the five features most correlated (in absolute value) with the target.
corr_matrix = dataset.corr()
corr_target = corr_matrix['attack_type']
# [1:6] skips the first entry (attack_type's correlation with itself == 1)
# and keeps the next five strongest features.
top_5_cols = corr_target.abs().sort_values(ascending=False)[1:6]
# BUG FIX: the guard previously called top_5_cols.drop(target_var) where
# target_var was never defined, raising NameError whenever it fired.
if 'attack_type' in top_5_cols:
    top_5_cols = top_5_cols.drop('attack_type')
top_5_cols_using_corr = top_5_cols.index.tolist()
top_5_cols_using_corr
# NOTE: keeps the original (misspelled) name "best_feautres_using_corr"
# because many later cells reference it.
best_feautres_using_corr = dataset[top_5_cols_using_corr]
best_feautres_using_corr.head()
best_feautres_using_corr.describe()
best_feautres_using_corr.info()
# Correlation heatmap restricted to the five correlation-selected features.
corr_matrix = best_feautres_using_corr.corr()
cmap = sns.color_palette("plasma")
fig, ax = plt.subplots(figsize=(40, 40))
sns.heatmap(
    corr_matrix,
    cmap=cmap,
    center=0,
    square=True,
    annot=True,
    annot_kws={'fontsize': 12},
    linewidths=.5,
    ax=ax,
)
ax.tick_params(axis='both', labelsize=14)
ax.set_title('Correlation Matrix', fontsize=18)
fig.savefig('correlation_matrix_of_features_selected_using_corr.png', dpi=300, bbox_inches='tight')
plt.show()
# 80/20 train/test split on the correlation-selected features.
X = best_feautres_using_corr
y = dataset['attack_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Shape / content sanity checks (display-only notebook expressions).
X_train.shape
X_test.shape
y_train.shape
y_test.shape
y_test.head()
y_train.head()
X_train.head()
X_test.head()
X_test.tail()
X_train.describe()
X_test.describe()
# --- Decision tree (entropy criterion) on correlation-selected features -------
clf_using_corr_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)
clf_using_corr_entropy.fit(X_train, y_train)
y_pred = clf_using_corr_entropy.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# calculate F1 score (weighted: the attack classes are imbalanced)
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)
# calculate accuracy (was computed and printed twice before; deduplicated)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# calculate error rate
error_rate = 1 - accuracy
print("Error Rate:", error_rate)
# plot confusion matrix
sns.heatmap(cm, annot=True, cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.savefig("cm_dt_corr_entropy.png")
plt.show()
# Visualize the decision tree.
# BUG FIX: class_names must be a sequence with one label per class; passing
# the string 'attack_type' made export_graphviz index individual characters.
dot_data = export_graphviz(
    clf_using_corr_entropy,
    out_file=None,
    feature_names=top_5_cols_using_corr,
    class_names=[str(c) for c in clf_using_corr_entropy.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
display(SVG(graph.pipe(format='svg')))
# --- Decision tree (gini criterion) on correlation-selected features ----------
X = best_feautres_using_corr
y = dataset['attack_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf_corr_using_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
clf_corr_using_gini.fit(X_train, y_train)
y_pred = clf_corr_using_gini.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# calculate F1 score (weighted: the attack classes are imbalanced)
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)
# calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# calculate error rate
error_rate = 1 - accuracy
print("Error Rate:", error_rate)
# plot confusion matrix
sns.heatmap(cm, annot=True, cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.savefig("cm_dt_corr_gini.png")
plt.show()
# Visualize the decision tree.
# BUG FIX: class_names must be one label per class; the string 'attack_type'
# was being indexed character by character by export_graphviz.
dot_data = export_graphviz(
    clf_corr_using_gini,
    out_file=None,
    feature_names=top_5_cols_using_corr,
    class_names=[str(c) for c in clf_corr_using_gini.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
display(SVG(graph.pipe(format='svg')))
# --- Feature selection via information gain (mutual information) --------------
# extract the features and target variable
X = dataset.drop('attack_type', axis=1)
y = dataset['attack_type']
# calculate the information gain of each feature.
# FIX: mutual_info_classif uses a randomized nearest-neighbour estimator; pin
# random_state so the selected feature set is reproducible, consistent with the
# random_state=42 used throughout the rest of this script.
info_gain = mutual_info_classif(X, y, random_state=42)
# create a dataframe holding the information gain of each feature,
# sorted in descending order
ig_df = pd.DataFrame({'feature': X.columns, 'info_gain': info_gain})
ig_df = ig_df.sort_values(by='info_gain', ascending=False)
# select the top five features based on information gain
top_5_cols_using_ig = list(ig_df['feature'][:5])
print("Top 5 features based on Information Gain:", top_5_cols_using_ig)
best_features_using_ig = dataset[top_5_cols_using_ig]
best_features_using_ig.head()
best_features_using_ig.tail()
best_features_using_ig.describe()
best_features_using_ig.info()
best_features_using_ig.shape
# 80/20 train/test split on the information-gain features.
X = best_features_using_ig
y = dataset['attack_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Shape / content sanity checks (display-only notebook expressions).
X_train.shape
X_test.shape
y_train.shape
y_test.shape
X_train.head()
X_test.head()
# --- Decision tree (entropy criterion) on information-gain features -----------
clf_using_ig_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)
clf_using_ig_entropy.fit(X_train, y_train)
y_pred = clf_using_ig_entropy.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# calculate F1 score (weighted: the attack classes are imbalanced)
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)
# calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# calculate error rate
error_rate = 1 - accuracy
print("Error Rate:", error_rate)
# plot confusion matrix
sns.heatmap(cm, annot=True, cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.savefig("cm_dt_ig_entropy.png")
plt.show()
# Visualize the decision tree.
# BUG FIX: class_names must be one label per class; the string 'attack_type'
# was being indexed character by character by export_graphviz.
dot_data = export_graphviz(
    clf_using_ig_entropy,
    out_file=None,
    feature_names=top_5_cols_using_ig,
    class_names=[str(c) for c in clf_using_ig_entropy.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
display(SVG(graph.pipe(format='svg')))
# --- Decision tree (gini criterion) on information-gain features --------------
X = best_features_using_ig
y = dataset['attack_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf_using_ig_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
clf_using_ig_gini.fit(X_train, y_train)
y_pred = clf_using_ig_gini.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# calculate F1 score (weighted: the attack classes are imbalanced)
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)
# calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# calculate error rate
error_rate = 1 - accuracy
print("Error Rate:", error_rate)
# plot confusion matrix
sns.heatmap(cm, annot=True, cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.savefig("cm_dt_ig_gini.png")
plt.show()
# Visualize the decision tree.
# BUG FIX: class_names must be one label per class; the string 'attack_type'
# was being indexed character by character by export_graphviz.
dot_data = export_graphviz(
    clf_using_ig_gini,
    out_file=None,
    feature_names=top_5_cols_using_ig,
    class_names=[str(c) for c in clf_using_ig_gini.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
display(SVG(graph.pipe(format='svg')))
# --- ANN (MLP) on the information-gain features -------------------------------
X = best_features_using_ig
y = dataset['attack_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape
X_train.head()
X_test.head()
# Normalize the input features (MLPs are sensitive to feature scale);
# the scaler is fit on the training split only to avoid leakage.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_train
X_test
y_train.head()
y_test.head()
# Two hidden layers (10 and 5 units), ReLU activations, Adam optimiser.
model_based_on_ig_features = MLPClassifier(
    hidden_layer_sizes=(10, 5),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=1,
)
model_based_on_ig_features.fit(X_train, y_train)
y_pred = model_based_on_ig_features.predict(X_test)
# Confusion-matrix heatmap.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.savefig("cm_ann_ig.png")
plt.show()
# Accuracy, precision, recall, F1 (weighted for class imbalance), error rate.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
error_rate = 1 - accuracy
# Print the results
print("Accuracy: {:.3f}".format(accuracy))
print("Precision: {:.3f}".format(precision))
print("Recall: {:.3f}".format(recall))
print("F1 Score: {:.3f}".format(f1))
print("Error Rate: {:.3f}".format(error_rate))
# --- ANN (MLP) on the correlation-selected features ---------------------------
X = best_feautres_using_corr
y = dataset['attack_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Normalize the input features (scaler fit on the training split only).
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_train
X_test
y_train.head()
y_test.head()
# Two hidden layers (10 and 5 units), logistic activations, Adam optimiser.
model_based_on_corr_features = MLPClassifier(
    hidden_layer_sizes=(10, 5),
    activation='logistic',
    solver='adam',
    max_iter=1000,
    random_state=1,
)
model_based_on_corr_features.fit(X_train, y_train)
# BUG FIX: this cell previously called model_based_on_ig_features.predict, so
# the metrics below never measured the model trained in this cell.
y_pred = model_based_on_corr_features.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# plot confusion matrix
sns.heatmap(cm, annot=True, cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.savefig("cm_ann_corr.png")
plt.show()
# Calculate accuracy, precision, recall, and F1 score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
# Print the results
print("Accuracy: {:.3f}".format(accuracy))
print("Precision: {:.3f}".format(precision))
print("Recall: {:.3f}".format(recall))
print("F1 Score: {:.3f}".format(f1))
# Baseline KNN on the correlation-selected features.
X = best_feautres_using_corr
y = dataset['attack_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a KNN model with k=5 (the old comment said k=3, but n_neighbors=5
# is what the code actually uses)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
# y_pred is computed but unused here; accuracy comes from knn.score below
y_pred = knn.predict(X_test)
# Evaluate the model's accuracy on the held-out test split
accuracy = knn.score(X_test, y_test)
print("Accuracy:", accuracy)
# --- KNN with grid-searched k on correlation-selected features ----------------
X = best_feautres_using_corr
y = dataset['attack_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a KNN model
knn = KNeighborsClassifier()
# Define the grid of hyperparameters to search over
param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 11]}
# 5-fold cross-validated grid search over k
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, y_train)
# Print the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
# FIX: use the k the grid search actually found instead of a hard-coded 11,
# so the final model cannot silently diverge from the search result.
knn = KNeighborsClassifier(n_neighbors=grid_search.best_params_['n_neighbors'])
# Train the model using the training set
knn.fit(X_train, y_train)
# Make predictions on the testing set
y_pred = knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# plot confusion matrix
sns.heatmap(cm, annot=True, cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.savefig("cm_knn_corr.png")
plt.show()
# Calculate accuracy, precision, recall, and F1 score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
# Print the results
print("Accuracy: {:.3f}".format(accuracy))
print("Precision: {:.3f}".format(precision))
print("Recall: {:.3f}".format(recall))
print("F1 Score: {:.3f}".format(f1))
# --- KNN with grid-searched k on information-gain features --------------------
X = best_features_using_ig
y = dataset['attack_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a KNN model
knn = KNeighborsClassifier()
# Define the grid of hyperparameters to search over
param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 11]}
# 5-fold cross-validated grid search over k
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, y_train)
# Print the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
# FIX: use the k the grid search actually found instead of a hard-coded 1,
# so the final model cannot silently diverge from the search result.
knn = KNeighborsClassifier(n_neighbors=grid_search.best_params_['n_neighbors'])
# Train the model using the training set
knn.fit(X_train, y_train)
# Make predictions on the testing set
y_pred = knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# plot confusion matrix
sns.heatmap(cm, annot=True, cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.savefig("cm_knn_ig.png")
plt.show()
# Calculate accuracy, precision, recall, and F1 score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
# Print the results
print("Accuracy: {:.3f}".format(accuracy))
print("Precision: {:.3f}".format(precision))
print("Recall: {:.3f}".format(recall))
print("F1 Score: {:.3f}".format(f1))
# Create a scaler object
scaler = StandardScaler()
# Standardise the correlation-selected features before clustering so no
# single feature dominates the k-means distance metric
scaled_data = scaler.fit_transform(best_feautres_using_corr)
sns.set_style('darkgrid')
# IPython magic: render inline figures at retina resolution (notebook only)
%config InlineBackend.figure_format = 'retina'
# Pairwise scatter plots to visualize relationships between the selected
# features and spot structure worth clustering
g = sns.PairGrid(best_feautres_using_corr)
g.map(sns.scatterplot);
fig, ax = plt.subplots(figsize=(20, 15))
# Elbow method: run k-means for k = 1..10 on the scaled correlation features
# and record the within-cluster sum of squares (inertia) for each k.
wcss = []
for i in range(1, 11):
    model = KMeans(
        n_clusters=i,
        init='k-means++',  # smart centroid seeding
        max_iter=300,      # cap on iterations per run
        n_init=10,         # restarts with different centroids
        random_state=0,    # reproducibility
    )
    model.fit(scaled_data)
    wcss.append(model.inertia_)
# Draw the elbow curve.
ax.plot(range(1, 11), wcss)
ax.set_title('Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Within Cluster Sum of Squares (WCSS)')
# Save the plot as a PNG image
plt.savefig('elbow_plot.png', dpi=300)
plt.show()
# K-means with k=4 on the standardised correlation-selected features,
# visualised in the first three feature dimensions.
data = scaled_data
k = 4
kmeans = KMeans(n_clusters=k, random_state=0)
labels = kmeans.fit_predict(data)
# 3D scatter coloured by assigned cluster.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=labels, cmap='rainbow')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_zlabel('Feature 3')
plt.title('K-Means Clustering with k=4')
# Save, then show the graph.
plt.savefig('kmeans_cluster.png')
plt.show()
best_features_using_ig.head()
# Standardise the information-gain features before clustering so no single
# feature dominates the k-means distance metric.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(best_features_using_ig)
# Pairwise scatter plots to eyeball structure in the selected features.
g = sns.PairGrid(best_features_using_ig)
g.map(sns.scatterplot)
fig, ax = plt.subplots(figsize=(20, 15))
# Elbow method on the scaled information-gain features: run k-means for
# k = 1..10 and record the within-cluster sum of squares (inertia).
wcss = []
for i in range(1, 11):
    model = KMeans(
        n_clusters=i,
        init='k-means++',  # initialization method for k-means
        max_iter=300,      # maximum number of iterations
        n_init=10,         # restarts with different centroids
        random_state=0,    # reproducibility
    )
    model.fit(scaled_data)
    wcss.append(model.inertia_)
# Show Elbow plot
ax.plot(range(1, 11), wcss)
ax.set_title('Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Within Cluster Sum of Squares (WCSS)')
# BUG FIX: previously saved to 'elbow_plot.png', silently overwriting the
# elbow plot from the correlation-feature run.
plt.savefig('elbow_plot_ig.png', dpi=300)
plt.show()
# K-means with k=8 on the standardised information-gain features,
# visualised in the first three feature dimensions.
data = scaled_data
k = 8
kmeans = KMeans(n_clusters=k, random_state=0)
labels = kmeans.fit_predict(data)
# 3D scatter coloured by assigned cluster.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=labels, cmap='rainbow')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_zlabel('Feature 3')
plt.title('K-Means Clustering with k=8')
# Save, then show the graph.
plt.savefig('kmeans_cluster_1.png')
plt.show()
# Project the full feature set (minus the target) down to three principal
# components for clustering and 3D visualisation.
dataset_kmeans = dataset.drop('attack_type', axis=1)
dataset_kmeans.head()
pca = PCA(n_components=3)
pca.fit(dataset_kmeans)
transformed_data = pca.transform(dataset_kmeans)
# Wrap the projection in a labelled DataFrame.
df_transformed = pd.DataFrame(data=transformed_data, columns=['Feature 1', 'Feature 2', 'Feature 3'])
# How much variance each principal component captures.
print('Explained variance ratio:', pca.explained_variance_ratio_)
df_transformed.head()
# NOTE(review): PCA is fit on unscaled data and the components are
# standardised afterwards; scaling *before* PCA is the more usual order —
# confirm this ordering is intentional.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_transformed)
scaled_data
fig, ax = plt.subplots(figsize=(20, 15))
# Elbow method on the scaled PCA components: run k-means for k = 1..10 and
# record the within-cluster sum of squares (inertia).
wcss = []
for i in range(1, 11):
    model = KMeans(
        n_clusters=i,
        init='k-means++',  # initialization method for k-means
        max_iter=300,      # maximum number of iterations
        n_init=10,         # restarts with different centroids
        random_state=0,    # reproducibility
    )
    model.fit(scaled_data)
    wcss.append(model.inertia_)
# Show Elbow plot
ax.plot(range(1, 11), wcss)
ax.set_title('Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Within Cluster Sum of Squares (WCSS)')
# BUG FIX: previously saved to 'elbow_plot.png', silently overwriting the
# elbow plots from the earlier runs.
plt.savefig('elbow_plot_pca.png', dpi=300)
plt.show()
# Apply K-means clustering with k=6 to the scaled PCA components.
kmeans = KMeans(n_clusters=6, random_state=0)
labels = kmeans.fit_predict(scaled_data)
# 3D scatter of the three principal components, coloured by cluster.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(scaled_data[:, 0], scaled_data[:, 1], scaled_data[:, 2], c=labels, cmap='rainbow')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_zlabel('Feature 3')
plt.title('K-Means Clustering with k=6')
# Save, then show the graph.
plt.savefig('kmeans_cluster_2.png')
plt.show()